Import libraries
from IPython.display import display, Markdown
import plotly.express as px
from sem_covid.services.data_registry import Dataset
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.categorical_analyze import fast_categorical_analyze
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.confidence_interval_analysis import (
confidence_interval_with_mean, z_score_for_series, confidence_interval_for_proportion)
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.collision_analysis import (class_collision_in_columns,
class_collision)
# Categorical columns of the timeline that the rest of the analysis focuses on.
CATEGORICAL_COLUMNS = ['keyword', 'page_type']
# Fetch the Ireland action timeline through the project's data registry.
ireland_action = Dataset.IRELAND_ACTION_TIMELINE.fetch()
100% (410 of 410) |######################| Elapsed Time: 0:00:00 Time: 0:00:00
# Run the categorical frequency analysis on the selected columns. The result
# is looked up per column name by the loops further down.
eda_result = fast_categorical_analyze(
    ireland_action,
    CATEGORICAL_COLUMNS,
    "Irish Dataset",
)
| | index | Absolute freq |
|---|---|---|
| 0 | published_date | 13 |
| 1 | title | 3 |
| 2 | content | 5 |
| 3 | content_links | 139 |
| 4 | campaigns_links | 323 |
| 5 | part_of_links | 229 |
| 6 | documents | 348 |
| | keyword | Relative freq |
|---|---|---|
| 0 | crisis | 2.44 |
| 1 | social impact | 2.44 |
| 2 | aid programme | 2.44 |
| 3 | health risk | 2.44 |
| 4 | organisation of health care | 2.44 |
| 5 | working environment | 2.44 |
| 6 | protective equipment | 2.44 |
| 7 | e-Health | 2.44 |
| 8 | air transport | 2.20 |
| 9 | aid to disadvantaged groups | 2.20 |
| | page_type | Relative freq |
|---|---|---|
| 0 | Press release | 57.56 |
| 1 | Publication | 26.83 |
| 2 | Speech | 7.32 |
| 3 | News | 4.88 |
| 4 | Collection | 1.95 |
| 5 | | 1.22 |
| 6 | Form | 0.24 |
# For every analysed categorical column, derive three extra metrics from its
# relative-frequency table (z-score, running total, row-to-row difference),
# then display the enriched table and one bar chart per derived metric.
for key, frequencies in eda_result.items():
    data = frequencies.copy()  # work on a copy so eda_result stays untouched

    # Resolve the two source columns up front, before derived columns are
    # appended, so nothing below depends on positional column order.
    category_column, freq_column = data.columns[0], data.columns[1]
    zscore_column = category_column + '_z_score'
    cumulative_freq = 'Cumulative freq'
    diff_freq = 'Diff freq'

    freq_std = data[freq_column].std()  # computed once, reused for the message
    data[zscore_column] = round((data[freq_column] - data[freq_column].mean()) / freq_std, 2)
    data[cumulative_freq] = data[freq_column].cumsum()
    data[diff_freq] = data[freq_column].diff()  # first row has no predecessor -> NaN

    display(Markdown(f"Std deviation for [{key}] is [{round(freq_std, 2)}]"))
    display(data)

    # Charts reference the derived columns by name rather than by position.
    # The first chart puts the z-score on x and the category on y; the other
    # two put the category on x.
    px.bar(data, x=zscore_column, y=category_column).show()
    px.bar(data, x=category_column, y=cumulative_freq).show()
    px.bar(data, x=category_column, y=diff_freq).show()
Std deviation for [keyword] is [0.68]
| | keyword | Relative freq | keyword_z_score | Cumulative freq | Diff freq |
|---|---|---|---|---|---|
| 0 | crisis | 2.44 | 1.37 | 2.44 | NaN |
| 1 | social impact | 2.44 | 1.37 | 4.88 | 0.00 |
| 2 | aid programme | 2.44 | 1.37 | 7.32 | 0.00 |
| 3 | health risk | 2.44 | 1.37 | 9.76 | 0.00 |
| 4 | organisation of health care | 2.44 | 1.37 | 12.20 | 0.00 |
| ... | ... | ... | ... | ... | ... |
| 61 | medical research | 0.49 | -1.52 | 99.06 | 0.00 |
| 62 | hospital infection | 0.24 | -1.88 | 99.30 | -0.25 |
| 63 | economic aid | 0.24 | -1.88 | 99.54 | 0.00 |
| 64 | endemic disease | 0.24 | -1.88 | 99.78 | 0.00 |
| 65 | restriction of liberty | 0.24 | -1.88 | 100.02 | 0.00 |
66 rows × 5 columns
Std deviation for [page_type] is [21.16]
| | page_type | Relative freq | page_type_z_score | Cumulative freq | Diff freq |
|---|---|---|---|---|---|
| 0 | Press release | 57.56 | 2.04 | 57.56 | NaN |
| 1 | Publication | 26.83 | 0.59 | 84.39 | -30.73 |
| 2 | Speech | 7.32 | -0.33 | 91.71 | -19.51 |
| 3 | News | 4.88 | -0.44 | 96.59 | -2.44 |
| 4 | Collection | 1.95 | -0.58 | 98.54 | -2.93 |
| 5 | | 1.22 | -0.62 | 99.76 | -0.73 |
| 6 | Form | 0.24 | -0.66 | 100.00 | -0.98 |
# For every analysed categorical column, compute a confidence interval around
# the mean relative frequency, attach a per-category interval and z-score, and
# split the categories into over-, normally- and under-represented groups
# relative to the interval bounds.
for key, frequencies in eda_result.items():
    data = frequencies.copy()  # work on a copy so eda_result stays untouched

    # Resolve the frequency column once, before derived columns are appended,
    # and use the same name for both the statistics and the filters below.
    freq_column = data.columns[1]

    # The helpers are fed fractions: the stored frequencies divided by 100.
    proportions = data[freq_column] / 100

    ci_mean = confidence_interval_with_mean(proportions)
    display(Markdown(f"Confidence Interval for {key} is : [{ci_mean[0]}%, {ci_mean[1]}%]"))
    data["Confidence Interval"] = confidence_interval_for_proportion(proportions)
    data["z_score"] = z_score_for_series(proportions)
    display(data)

    # The bounds are compared directly against the un-scaled frequency column.
    # NOTE(review): this presumes confidence_interval_with_mean returns its
    # bounds on the same (percentage) scale as that column -- confirm.
    # NOTE(review): the lower bound can be negative, in which case no row can
    # be classified as under-represented -- consider clamping at 0.
    lower_bound, upper_bound = ci_mean[0], ci_mean[1]
    relative_freq = data[freq_column]

    display(Markdown(f"Overrepresented records from column : {key}"))
    display(data.loc[relative_freq > upper_bound])
    display(Markdown(f"Normal represented records from column : {key}"))
    display(data.loc[(relative_freq >= lower_bound) & (relative_freq <= upper_bound)])
    display(Markdown(f"Underrepresented records from column : {key}"))
    display(data.loc[relative_freq < lower_bound])
Confidence Interval for keyword is : [1.35%, 1.68%]
| | keyword | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 0 | crisis | 2.44 | [0.0, 6.16] | 1.38 |
| 1 | social impact | 2.44 | [0.0, 6.16] | 1.38 |
| 2 | aid programme | 2.44 | [0.0, 6.16] | 1.38 |
| 3 | health risk | 2.44 | [0.0, 6.16] | 1.38 |
| 4 | organisation of health care | 2.44 | [0.0, 6.16] | 1.38 |
| ... | ... | ... | ... | ... |
| 61 | medical research | 0.49 | [0.0, 2.17] | -1.53 |
| 62 | hospital infection | 0.24 | [0.0, 1.42] | -1.90 |
| 63 | economic aid | 0.24 | [0.0, 1.42] | -1.90 |
| 64 | endemic disease | 0.24 | [0.0, 1.42] | -1.90 |
| 65 | restriction of liberty | 0.24 | [0.0, 1.42] | -1.90 |
66 rows × 4 columns
Overrepresented records from column : keyword
| | keyword | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 0 | crisis | 2.44 | [0.0, 6.16] | 1.38 |
| 1 | social impact | 2.44 | [0.0, 6.16] | 1.38 |
| 2 | aid programme | 2.44 | [0.0, 6.16] | 1.38 |
| 3 | health risk | 2.44 | [0.0, 6.16] | 1.38 |
| 4 | organisation of health care | 2.44 | [0.0, 6.16] | 1.38 |
| 5 | working environment | 2.44 | [0.0, 6.16] | 1.38 |
| 6 | protective equipment | 2.44 | [0.0, 6.16] | 1.38 |
| 7 | e-Health | 2.44 | [0.0, 6.16] | 1.38 |
| 8 | air transport | 2.20 | [0.0, 5.74] | 1.02 |
| 9 | aid to disadvantaged groups | 2.20 | [0.0, 5.74] | 1.02 |
| 10 | respiratory disease | 2.20 | [0.0, 5.74] | 1.02 |
| 11 | innovation | 2.20 | [0.0, 5.74] | 1.02 |
| 12 | economic activity | 2.20 | [0.0, 5.74] | 1.02 |
| 13 | social well-being | 2.20 | [0.0, 5.74] | 1.02 |
| 14 | tourism | 2.20 | [0.0, 5.74] | 1.02 |
| 15 | patient safety | 2.20 | [0.0, 5.74] | 1.02 |
| 16 | social media | 2.20 | [0.0, 5.74] | 1.02 |
| 17 | vaccination | 1.95 | [0.0, 5.29] | 0.65 |
| 18 | EU financing | 1.95 | [0.0, 5.29] | 0.65 |
| 19 | health control | 1.95 | [0.0, 5.29] | 0.65 |
| 20 | basic needs | 1.95 | [0.0, 5.29] | 0.65 |
| 21 | quality of life | 1.95 | [0.0, 5.29] | 0.65 |
| 22 | free movement of workers | 1.95 | [0.0, 5.29] | 0.65 |
| 23 | public awareness campaign | 1.95 | [0.0, 5.29] | 0.65 |
| 24 | applied research | 1.95 | [0.0, 5.29] | 0.65 |
| 25 | health legislation | 1.95 | [0.0, 5.29] | 0.65 |
| 26 | covid | 1.71 | [0.0, 4.84] | 0.29 |
| 27 | working conditions | 1.71 | [0.0, 4.84] | 0.29 |
| 28 | crisis management | 1.71 | [0.0, 4.84] | 0.29 |
| 29 | living conditions | 1.71 | [0.0, 4.84] | 0.29 |
| 30 | pandemic | 1.71 | [0.0, 4.84] | 0.29 |
Normal represented records from column : keyword
| | keyword | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 31 | communications policy | 1.46 | [0.0, 4.35] | -0.08 |
| 32 | health policy | 1.46 | [0.0, 4.35] | -0.08 |
| 33 | economic support | 1.46 | [0.0, 4.35] | -0.08 |
| 34 | European Centre for Disease Prevention and Con... | 1.46 | [0.0, 4.35] | -0.08 |
| 35 | health service | 1.46 | [0.0, 4.35] | -0.08 |
| 36 | research and development | 1.46 | [0.0, 4.35] | -0.08 |
| 37 | social participation | 1.46 | [0.0, 4.35] | -0.08 |
| 38 | public health | 1.46 | [0.0, 4.35] | -0.08 |
| 39 | occupational health | 1.46 | [0.0, 4.35] | -0.08 |
| 40 | disease prevention | 1.46 | [0.0, 4.35] | -0.08 |
| 41 | state of emergency | 1.46 | [0.0, 4.35] | -0.08 |
Underrepresented records from column : keyword
| | keyword | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 42 | distance learning | 1.22 | [0.0, 3.87] | -0.44 |
| 43 | labour market | 1.22 | [0.0, 3.87] | -0.44 |
| 44 | disease surveillance | 1.22 | [0.0, 3.87] | -0.44 |
| 45 | standard of living | 1.22 | [0.0, 3.87] | -0.44 |
| 46 | public hygiene | 1.22 | [0.0, 3.87] | -0.44 |
| 47 | infectious disease | 1.22 | [0.0, 3.87] | -0.44 |
| 48 | social sciences | 1.22 | [0.0, 3.87] | -0.44 |
| 49 | viral disease | 1.22 | [0.0, 3.87] | -0.44 |
| 50 | self-regulation | 1.22 | [0.0, 3.87] | -0.44 |
| 51 | patient rights | 0.98 | [0.0, 3.36] | -0.80 |
| 52 | economic consequence | 0.73 | [0.0, 2.78] | -1.17 |
| 53 | epidemiology | 0.73 | [0.0, 2.78] | -1.17 |
| 54 | virus | 0.73 | [0.0, 2.78] | -1.17 |
| 55 | coronavirus disease | 0.49 | [0.0, 2.17] | -1.53 |
| 56 | social situation | 0.49 | [0.0, 2.17] | -1.53 |
| 57 | free movement of persons | 0.49 | [0.0, 2.17] | -1.53 |
| 58 | freedom of movement | 0.49 | [0.0, 2.17] | -1.53 |
| 59 | aid to undertakings | 0.49 | [0.0, 2.17] | -1.53 |
| 60 | illness | 0.49 | [0.0, 2.17] | -1.53 |
| 61 | medical research | 0.49 | [0.0, 2.17] | -1.53 |
| 62 | hospital infection | 0.24 | [0.0, 1.42] | -1.90 |
| 63 | economic aid | 0.24 | [0.0, 1.42] | -1.90 |
| 64 | endemic disease | 0.24 | [0.0, 1.42] | -1.90 |
| 65 | restriction of liberty | 0.24 | [0.0, 1.42] | -1.90 |
Confidence Interval for page_type is : [-1.39%, 29.96%]
| | page_type | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 0 | Press release | 57.56 | [20.95, 94.17] | 2.21 |
| 1 | Publication | 26.83 | [0.0, 59.65] | 0.64 |
| 2 | Speech | 7.32 | [0.0, 26.62] | -0.36 |
| 3 | News | 4.88 | [0.0, 20.84] | -0.48 |
| 4 | Collection | 1.95 | [0.0, 12.19] | -0.63 |
| 5 | | 1.22 | [0.0, 9.35] | -0.67 |
| 6 | Form | 0.24 | [0.0, 3.86] | -0.72 |
Overrepresented records from column : page_type
| | page_type | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 0 | Press release | 57.56 | [20.95, 94.17] | 2.21 |
Normal represented records from column : page_type
| | page_type | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
| 1 | Publication | 26.83 | [0.0, 59.65] | 0.64 |
| 2 | Speech | 7.32 | [0.0, 26.62] | -0.36 |
| 3 | News | 4.88 | [0.0, 20.84] | -0.48 |
| 4 | Collection | 1.95 | [0.0, 12.19] | -0.63 |
| 5 | | 1.22 | [0.0, 9.35] | -0.67 |
| 6 | Form | 0.24 | [0.0, 3.86] | -0.72 |
Underrepresented records from column : page_type
| | page_type | Relative freq | Confidence Interval | z_score |
|---|---|---|---|---|
# Run both collision helpers on the categorical subset of the dataset; each
# call receives its own freshly selected slice of CATEGORICAL_COLUMNS.
# NOTE(review): judging by the names, the first works column by column and the
# second on the frame as a whole -- confirm in eda_wrangling.collision_analysis.
class_collision_in_columns(ireland_action[CATEGORICAL_COLUMNS])
# NOTE(review): as the final expression of the cell, its return value (if any)
# is presumably what the notebook renders -- keep it last.
class_collision(ireland_action[CATEGORICAL_COLUMNS])
Collision in dataframe